O’Hara on GitHub


knitr::opts_chunk$set(fig.width = 6, fig.height = 4, fig.path = 'Figs/',
                      echo = TRUE, message = FALSE, warning = FALSE)

library(raster)
source('https://raw.githubusercontent.com/oharac/src/master/R/common.R')

library(sf)
library(data.table)

dir_git <- '~/github/spp_risk_dists'
dir_o_anx <- file.path(dir_O, 'git-annex/spp_risk_dists')

source(file.path(dir_git, 'data_setup/common_fxns.R'))

### goal specific folders and info

dir_data <- file.path(dir_git, 'data')

### Gall-Peters doesn't have an EPSG?
gp_proj4 <- '+proj=cea +lon_0=0 +lat_ts=45 +x_0=0 +y_0=0 +ellps=WGS84 +units=m +no_defs'

1 Summary

Using individual species range data (csv outputs from 5_process_spp_shps.Rmd), collect into species groups. In each species group, process range and risk information to get mean risk, variance of risk, species richness, and threatened species count per cell. These are all weighted by species range rarity as a metric of endemism.

2 Methods

2.1 Generate species group maps - range-rarity weighting per cell

  • Determine species ranges using ocean-area data.frame and cell location data.frame. Create an overall data.frame and save as .csv.
    • Compare to polygon range areas
  • Collect all taxon species into a single data.frame.
  • Join species extinction risk and trend data.frame to species-cell data.frame, including regional assessments.
  • Join species range area
  • By cell, calculate range-rarity-weighted mean extinction risk, variance of extinction risk, trend, threatened species count, rr_richness, rr_trend_richness (where rr_richness is range-rarity weighted species richness)
    • When aggregating further, use rr_richness to weight values.

NOTE: as above, because the summary files are likely to be very large for globe-spanning taxa, save these outputs outside of GitHub.

2.1.1 Calculate range from the rasterized range maps

This calculated range includes ONLY ocean area. For species such as birds whose range is primarily over land but extends somewhat out over the ocean, this will underpredict range according to the polygons. For marine species whose range is extremely tiny, the raster may miss the presence of a species entirely. These ranges will be checked against the polygon areas as a quality check.

spp_range_file <- file.path(dir_git, 'data',
                            sprintf('iucn_spp_range_area_%s.csv', api_version))
reload <- FALSE
if(!file.exists(spp_range_file) | reload == TRUE) {
  ### load area raster, convert to data frame
  area_rast <- raster(file.path(dir_git, 'spatial/ocean_area_rast.tif'))
  area_df <- data.frame(cell_id = 1:length(area_rast),
                        area_km2 = values(area_rast))
  
  ### load csvs of marine species range maps
  spp_maps <- read_csv(file.path(dir_data, sprintf('spp_marine_maps_%s.csv', api_version)),
                       col_types = 'ddciccc')
  spp_ids <- spp_maps$iucn_sid %>%
    unique() %>%
    sort()
  
  if(reload == TRUE & file.exists(spp_range_file)) {
    spp_range_list_done <- read_csv(spp_range_file, col_types = 'ddcc')
    spp_ids <- spp_ids[!spp_ids %in% spp_range_list_done$iucn_sid]
  }
  
  if(length(spp_ids) > 0) { ### any more spp to calculate?
    system.time({
    spp_range_list <- parallel::mclapply(spp_ids, mc.cores = 12,
                                         FUN = function(x) {
      ### x <- spp_ids[1]
      spp_map <- read_csv(file.path(dir_o_anx, 'spp_rasters',
                                    sprintf('iucn_sid_%s.csv', x)),
                          col_types = 'di')
      
      spp_map_area <- spp_map %>%
        # filter(presence != 5) %>% ### exclude extinct areas
        inner_join(area_df, by = 'cell_id')
      
      spp_range <- data.frame(iucn_sid  = x,
                              range_km2 = sum(spp_map_area$area_km2, na.rm = TRUE))
      # spp_range_list[[i]] <- spp_range
      return(spp_range)
    # }
    }) ### end of mclapply
    }) ### end of system.time
  
    spp_range_df <- bind_rows(spp_range_list) %>%
      left_join(spp_maps %>% 
                  select(iucn_sid, sciname, dbf_file) %>%
                  distinct(),
                by = 'iucn_sid')
  } else {
    spp_range_df <- data.frame() ### return an empty data.frame
  }
  
  if(reload == TRUE & exists('spp_range_list_done')) {
    ### add completed area calcs to new area calcs at the end of the dataframe
    spp_range_df <- spp_range_df %>%
      bind_rows(spp_range_list_done)
  }

  
  write_csv(spp_range_df, spp_range_file)
}

2.1.2 calculate range from polygons

This will overpredict range for many species whose range is limited by depth, due to polyon buffering. This can be used to sanity-check the raster-calculated ranges.

spp_maps <- read_csv(file.path(dir_data, sprintf('spp_marine_maps_%s.csv', api_version)),
                     col_types = 'ddciccc')

dir_shp <- file.path(dir_M, sprintf('git-annex/globalprep/_raw_data/iucn_spp/d%s', api_version))
dir_bli <- file.path(dir_M, 'git-annex/globalprep/_raw_data/birdlife_intl/d2018')

spp_dbfs <- spp_maps$dbf_file %>%
  unique() 
spp_shps <- spp_dbfs %>%
  str_replace('\\.dbf$', '.shp')
spp_shps <- ifelse(str_detect(spp_shps, 'bli'), 
                   file.path(dir_bli, spp_shps),
                   file.path(dir_shp, spp_shps))

polygon_areas_file <- file.path(dir_git, 'data_setup/int/polygon_areas.csv')

if(!file.exists(polygon_areas_file)) {

  poly_range_list <- parallel::mclapply(spp_shps, mc.cores = 12,
                          FUN = function(x) {
                            ### x <- spp_shps[1]
                            shp <- st_read(x)
                            shp <- shp %>%
                              filter(id_no %in% spp_maps$iucn_sid) ### NOTE: this will miss subpops
                            
                            range_df <- data.frame(iucn_sid = shp$id_no,
                                                   poly_range_km2 = st_area(shp)) %>%
                              mutate(poly_range_km2 = as.numeric(poly_range_km2 / 1e6))
                          })
  
  poly_range_df <- poly_range_list %>%
    setNames(basename(spp_shps)) %>%
    bind_rows(.id = 'shp_file')
  
  write_csv(poly_range_df, polygon_areas_file)
}

2.1.3 plot raster-calculated ranges vs polygon-calculated ranges

range_df <- read_csv(polygon_areas_file, col_types = 'dd') %>%
  full_join(read_csv(spp_range_file, col_types = 'ddcc') %>%
              mutate(shp_file = str_replace(dbf_file, 'dbf', 'shp')), 
            by = 'iucn_sid') %>%
  mutate(birds = str_detect(shp_file, 'bli'))

range_compare_scatter <- ggplot(range_df, aes(x = range_km2, y = poly_range_km2)) +
  ggtheme_plot() +
  geom_abline(slope = 1, intercept = 0, color = 'green4') + 
  geom_point(aes(color = birds, label = shp_file, label2 = iucn_sid), alpha = .6)

plotly::ggplotly(range_compare_scatter)

2.2 Aggregate species maps to taxa level

Species cell values for category and trend are multiplied by range rarity before summarizing to mean and variance. Rather than species counts, the sum of range rarity is calculated for each cell.

Since most of these values will be tiny (since range rarity is generally a tiny number for species with any significant area), we can later multiply these by a scaling factor to get something more in line with more convenient orders of magnitude.

### Read in lots of data
spp_maps <- read_csv(file.path(dir_data, sprintf('spp_marine_maps_%s.csv', api_version)),
                     col_types = 'ddciccc')

spp_risk <- read_csv(file.path(dir_data, sprintf('iucn_risk_current_%s.csv', api_version)),
                     col_types = 'dccicccdc')
spp_risk_rgn <- read_csv(file.path(dir_data, sprintf('iucn_risk_rgn_current_%s.csv', api_version)),
                         col_types = 'dcccdc')

spp_trend <- read_csv(file.path(dir_data, sprintf('iucn_trend_by_spp_%s.csv', api_version)),
                      col_types = 'dc_d_')

### load range info and ocean area info
spp_range <- read_csv(file.path(dir_git, 'data',
                                sprintf('iucn_spp_range_area_%s.csv', api_version)),
                      col_types = 'dd__')

# ocean_area_rast <- raster(file.path(dir_git, 'spatial/ocean_area_rast.tif'))
# cell_area_df <- data.frame(cell_ocean_area = values(ocean_area_rast),
#                            cell_id = 1:length(ocean_area_rast))

### make a dataframe of species risk, trend, regional risk, and species range
spp_risk_trend <- spp_risk %>%
  mutate(iucn_rgn = 'global') %>%
  bind_rows(spp_risk_rgn) %>%
  select(iucn_sid, iucn_rgn, cat_score) %>%
  left_join(spp_trend, by = c('iucn_sid', 'iucn_rgn')) %>%
  left_join(spp_range, by = 'iucn_sid')

### raster for cell IDs
rast_cell_ids <- raster(file.path(dir_git, 'spatial', 'cell_id_rast.tif'))

### Make a dataframe of cell ID to LME for regional assessments... also 
### make a lookup of LME to region
lme_rgns_rast <- raster::raster(file.path(dir_git, 'spatial', 'lme_rast.tif'))
lme_cells_all <- data.frame(cell_id = values(rast_cell_ids),
                            lme_id  = values(lme_rgns_rast)) %>%
  filter(!is.na(lme_id))
lme_to_rgn <- read_csv(file.path(dir_git, 'spatial/iucn_rgn_to_lme.csv'))


### Make a list of taxonomic groups to loop over:
taxa <- spp_maps$dbf_file %>%
  unique() %>%
  str_replace('\\.dbf$', '')
# taxa <- taxa[c(5, 22, 13, 25)]

##########################################################.
### Looping over taxonomic groups -----
##########################################################.
for(taxon in taxa) {
  ### taxon <- 'SEAGRASSES'
  ### taxon <- taxa[1]
  
  taxon_risk_sum_file <- file.path(dir_o_anx, 'taxa_summaries',
                                   sprintf('%s_cell_sum_rrweight_%s.csv', 
                                           tolower(taxon), api_version))
  
  reload <- FALSE
  if(!file.exists(taxon_risk_sum_file) | reload == TRUE) {
    
    taxon_maps <- spp_maps %>%
      filter(str_detect(dbf_file, taxon))
    
    taxon_risk_trend <- spp_risk_trend %>%
      filter(iucn_sid %in% taxon_maps$iucn_sid) %>%
      filter(!is.na(cat_score)) %>%
      arrange(iucn_sid)
    
    ### Using the iucn_sid field, generate a vector of all species range files for
    ### this taxon.
    taxon_ids <- taxon_risk_trend$iucn_sid %>%
      unique()
    
    message('Processing ', length(taxon_ids), ' species maps in ', taxon, '...')

    ##########################################################.
    ### Looping over species within group -----
    ##########################################################.
    ### Collect all species ranges for this taxon into a single data.frame.
    ### Use mclapply since we're reading many large-ish files.  For MARINE_MAMMALS (85 assessed spp)
    ### this takes about 30-40 seconds
    taxon_cells_list <- parallel::mclapply(taxon_ids, mc.cores = 24,
                                        FUN = function(x) {
        ### x <- taxon_ids[5]
        ### x <- 6336                            
        csv_file <- file.path(dir_o_anx, 'spp_rasters', 
                              sprintf('iucn_sid_%s.csv', x))
        
        spp_risk_map <- read_csv(csv_file, col_types = 'di') %>%
          mutate(iucn_sid = x) %>%
          left_join(taxon_risk_trend, by = 'iucn_sid') 
        
        ### Identify regional assessments if any
        lme_rgns <- lme_to_rgn %>%
          filter(iucn_rgn %in% spp_risk_map$iucn_rgn)
        
        non_global_rgns <- lme_rgns %>%
          filter(iucn_rgn != 'global')
        
        if(nrow(non_global_rgns) > 0) {
          ### If any regional assessments, clip the LME cells down to the appropriate region...
          lme_cells <- lme_cells_all %>%
            inner_join(lme_rgns, by = 'lme_id') %>%
            rename(rgn_name = iucn_rgn)
          ### ... then filter out non-matching overlapped cells
          spp_risk_map <- spp_risk_map %>%
            left_join(lme_cells, by = 'cell_id') %>%
            mutate(rgn_name = ifelse(is.na(lme_id), 'global', rgn_name),
                   priority = ifelse(rgn_name == 'global', 100, priority)) %>%
            filter(iucn_rgn == rgn_name) %>%
            group_by(cell_id) %>%
            filter(priority == min(priority)) %>%
            ungroup()
          ### NOTE: at this point, still possible to have multiple regional
          ### assessments, if priorities are the same (e.g. Europe and Pan Africa,
          ### or overlaps around Africa).  Those values will be averaged in
          ### the group_by() below.
        }
        
        ### calc range rarity and select down to main columns; also, 
        ### if presence == 5 (extinct), adjust category and trend scores.
        spp_risk_map <- spp_risk_map %>%
          # left_join(cell_area_df, by = 'cell_id') %>%
          select(cell_id, presence, iucn_sid, iucn_rgn, 
                 cat_score, trend_score, range_km2) %>% # , cell_ocean_area) %>%
          distinct() %>%
          mutate(range_rarity = ifelse(range_km2 > 0, 1 / range_km2, 0),
                 cat_score   = ifelse(presence == 5, 1, cat_score),
                 trend_score = ifelse(presence == 5, NA, trend_score)) %>%
          select(-range_km2) %>% # , cell_ocean_area) %>%
            ### drop variables no longer needed
          group_by(cell_id, iucn_sid) %>%
          summarize(cat_score   = mean(cat_score, na.rm = TRUE),
                    trend_score = mean(trend_score, na.rm = TRUE),
                    iucn_rgn = paste0(iucn_rgn, collapse = ','),
                    presence = paste0(presence, collapse = ','),
                    range_rarity = first(range_rarity)) %>%
          ungroup() %>%
          mutate(trend_score = ifelse(is.nan(trend_score), NA, trend_score))
        
        return(spp_risk_map)
      }) ### end of mclapply over all species in taxonomic group
  
    ##########################################################.
    ### Processing cell calculations for group -----
    ##########################################################.
  
    ### Set up for keyed data.table merging: key for iucn_sid, cell_id
    message('...binding rows into data.frame...')
    taxon_risk_map <- bind_rows(taxon_cells_list)
    
    message('...summarizing...')
    taxon_risk_summary <- taxon_risk_map %>%
      group_by(cell_id) %>%
      summarize(mean_risk = sum(cat_score * range_rarity) / sum(range_rarity), ### NA categories already filtered out
                sr_rr_risk  = sum(range_rarity),
                v1 = sum(range_rarity[!is.na(cat_score)]),   ### for recombining group variances
                v2 = sum(range_rarity[!is.na(cat_score)]^2), ### for recombining group variances
                alpha     = v1 - (v2 / v1), 
                var_risk  = sum(range_rarity * (cat_score - mean_risk)^2) / alpha,
                  ### see https://en.wikipedia.org/wiki/Weighted_arithmetic_mean#Weighted_sample_variance
                var_risk  = ifelse(is.nan(var_risk), NA, var_risk),
                  ### for cells with only one spp in the cell, returns NaN, reset to NA
                sr_rr_threatened = sum((cat_score >= 0.4 & cat_score < 1) * range_rarity),  
                mean_trend  = sum(trend_score * range_rarity, na.rm = TRUE) /
                  sum(range_rarity[!is.na(trend_score)]),
                mean_trend  = ifelse(is.nan(mean_trend), NA, mean_trend),
                sr_rr_trend = sum(range_rarity[!is.na(trend_score)]),
                n_spp = n()) %>%
      select(-alpha)
    
    message('...writing file', taxon_risk_sum_file, '...')
    write_csv(taxon_risk_summary, taxon_risk_sum_file)

  } else { ### end of if statement checking whether file exists for this taxon
    message('Found file ', taxon_risk_sum_file, '... skipping process...')
  }
  
} ### end of taxonomic group loop